1 Configuracion inicial

  1. Se descargaron las siguientes paqueterias de R y Bioconductor.
  2. Se descarg?? datos de Hi-C 10kb de resoluci??n de la base de datos PsychENCODE (Promoter-anchored_chromatin_loop.bed)
  3. Se descargaron datos de expresion de single cell desde PsychENCODE
  4. Se descargaron datos de expresion de cerebros en desarrollo desde BrainSpan
  5. Se descargaron las coordenadas de exones (Gencode19_exon.bed y Gencode19_promoter.bed) desde Gencode version 19. Los promotores se definieron 2kb arriba de TSS. El formato es cromosoma, inicio, fin, y gen. 6. Se descargaron datos de anotacion (geneAnno.rds) desde Biomart. Este archivo sirve para hacer el match de genes basados en Ensembl IDs y HUGO Gene Nomenclature Committee (HGNC) symbol.

Todos los datos se tomaron desde muestras postmortem neurotipicos.

library("GenomicRanges")
library("biomaRt")
library("WGCNA")
library("reshape")
library("corrplot")
library("gProfileR")
library("tidyverse")
library("ggpubr")

2 Generacion de un objeto GRanges para los SNPs

credSNP <- vroom::vroom("datos_newAnalysis/AD_complete.csv")
## Rows: 477
## Columns: 71
## Delimiter: ","
## chr [ 5]: SNP, Ancestral, Minor, CONTEXT, Class
## dbl [66]: Global, Global_MAF, ACB, ACB_MAF, AFR, AFR_MAF, AMR, AMR_MAF, ASW, ASW_MAF, BEB,...
## 
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message
credranges <- GRanges(credSNP$seq_region_name, IRanges(credSNP$start, credSNP$start),
                      rsid = credSNP$SNP, Context=credSNP$CONTEXT)

credranges
## GRanges object with 477 ranges and 2 metadata columns:
##         seqnames    ranges strand |        rsid                Context
##            <Rle> <IRanges>  <Rle> | <character>            <character>
##     [1]        8  94979792      * |  rs10098778         intron_variant
##     [2]        8  27354759      * |  rs10109834         intron_variant
##     [3]       19  44903416      * |     rs10119    3_prime_UTR_variant
##     [4]        9   2844360      * |  rs10122329 regulatory_region_va..
##     [5]        2 233575317      * |  rs10182651     intergenic_variant
##     ...      ...       ...    ... .         ...                    ...
##   [473]       19   3405594      * |   rs9749589         intron_variant
##   [474]       11  60156035      * |    rs983392     intergenic_variant
##   [475]        3 190951729      * |   rs9877502     intergenic_variant
##   [476]       17  75022679      * |   rs9899728     intergenic_variant
##   [477]       17   5081152      * |   rs9916042         intron_variant
##   -------
##   seqinfo: 22 sequences from an unspecified genome; no seqlengths

3 Mapeo por posicion

Primero se cargan las regiones promotoras y exonicas para generar un objeto GRanges

exon <- read.table("data/Gencode19_exon.bed")
exonranges <- GRanges(exon[,1],IRanges(exon[,2], exon[,3]), gene = exon[,4])
promoter <- read.table("data/Gencode19_promoter.bed")
promoterranges <- GRanges(promoter[,1], IRanges(promoter[,2], promoter[,3]), gene = promoter[,4])

Se hace un sobrelape de los objetos GRanges de SNPs con exones

olap <- findOverlaps(credranges, exonranges)
credexon <- credranges[queryHits(olap)]
mcols(credexon) <- cbind(mcols(credexon), mcols(exonranges[subjectHits(olap)]))

Se hace un sobrelape de los objetos GRanges de SNPs con regiones promotoras

olap <- findOverlaps(credranges, promoterranges)
credpromoter <- credranges[queryHits(olap)]
mcols(credpromoter) <- cbind(mcols(credpromoter), mcols(promoterranges[subjectHits(olap)]))

Se unen los SNPs a sus supiestos genes diana usando interacciones de cromatina. Se cargan datos Hi-C y se genera un objeto GRange.

hic <- read.table("data/Promoter-anchored_chromatin_loops.bed", skip=1)
colnames(hic) <- c("chr", "TSS_start", "TSS_end", "Enhancer_start", "Enhancer_end")
hicranges <- GRanges(hic$chr, IRanges(hic$TSS_start, hic$TSS_end), enhancer=hic$Enhancer_start)
olap <- findOverlaps(hicranges, promoterranges)
hicpromoter <- hicranges[queryHits(olap)]
mcols(hicpromoter) <- cbind(mcols(hicpromoter), mcols(promoterranges[subjectHits(olap)]))
hicenhancer <- GRanges(seqnames(hicpromoter), IRanges(hicpromoter$enhancer, hicpromoter$enhancer+10000),
                       gene=hicpromoter$gene)

Se sobrelapan los SNNps con el objeto GRanges de Hi-C

olap <- findOverlaps(credranges, hicenhancer)
credhic <- credranges[queryHits(olap)]
mcols(credhic) <- cbind(mcols(credhic), mcols(hicenhancer[subjectHits(olap)]))

Se seleccionan los genes que resultaron del mapeo anterior. Los genes candidatos resultates para AD son:

ADgenes <- Reduce(union, list(credhic$gene, credexon$gene, credpromoter$gene))

SNPs <- c(credhic$rsid, credexon$rsid, credpromoter$rsid)
SNPs <- unique(SNPs)

Convertir Ensembl Gene ID a HGNC symbol

load("data/geneAnno2.rda")
ADhgnc <- geneAnno1[match(ADgenes, geneAnno1$ensembl_gene_id), "hgnc_symbol"]
ADhgnc <- ADhgnc[ADhgnc!=""]

4 Trajectorias de expresion de desarrollo

Procesamiento de datos de expresion y meta data

datExpr <- read.csv("data/gene_array_matrix_csv/expression_matrix.csv", header = FALSE)
datExpr <- datExpr[,-1]
datMeta <- read.csv("data/gene_array_matrix_csv/columns_metadata.csv")
datProbes <- read.csv("data/gene_array_matrix_csv/rows_metadata.csv")
datExpr <- datExpr[datProbes$ensembl_gene_id!="",]
datProbes <- datProbes[datProbes$ensembl_gene_id!="",]
datExpr.cr <- collapseRows(datExpr, rowGroup = datProbes$ensembl_gene_id,
                           rowID= rownames(datExpr))
datExpr <- datExpr.cr$datETcollapsed
gename <- data.frame(datExpr.cr$group2row)
rownames(datExpr) <- gename$group

datExpr[1:5,1:7]
##                       V2       V3       V4       V5       V6       V7       V8
## ENSG00000000003 10.25030 10.70150 11.08360 10.06610 11.19010 10.44810 10.40350
## ENSG00000000005  3.51709  3.53494  3.72414  3.67140  3.62474  3.62463  3.58473
## ENSG00000000419 10.19980 10.36500 10.61390 10.15490 10.65210  9.92869  9.88633
## ENSG00000000457  5.79191  7.03140  7.18755  6.10701  6.77977  5.33604  6.05875
## ENSG00000000938  5.61206  5.53656  5.22315  5.81295  5.60542  5.66549  5.50235

Se especifican los estadios de desarrollo.

datMeta$Unit <- "Postnatal"
idx <- grep("pcw", datMeta$age)
datMeta$Unit[idx] <- "Prenatal"
idx <- grep("yrs", datMeta$age)
datMeta$Unit[idx] <- "Postnatal"
datMeta$Unit <- factor(datMeta$Unit, levels=c("Prenatal", "Postnatal"))

Se seleccionan las regiones corticales

datMeta$Region <- "SubCTX"
r <- c("A1C", "STC", "ITC", "TCx", "OFC", "DFC",
       "VFC", "MFC", "M1C", "S1C", "IPC", "M1C-S1C",
       "PCx", "V1C", "Ocx")
datMeta$Region[datMeta$structure_acronym %in% r] <- "CTX"
datExpr <- datExpr[,which(datMeta$Region=="CTX")]
datMeta <- datMeta[which(datMeta$Region=="CTX"),]
save(datExpr, datMeta, file="datos_newAnalysis/devExpr.rda")

Se extraen los perfiles de expresion de desarrollo de los genes de riesgo para AD Se hace un heatmap para ver la calidad de los datos

load("datos_newAnalysis/ADgenes.rda")
exprdat <- apply(datExpr[match(ADgenes, rownames(datExpr)),], 2, mean, na.rm=T)
dat <- data.frame(Region=datMeta$Region, Unit=datMeta$Unit, Expr=exprdat)

prueba <- datExpr[match(ADgenes, rownames(datExpr)),]
prueba <- na.omit(prueba)
ADhgnc <- geneAnno1[match(rownames(prueba), geneAnno1$ensembl_gene_id), "hgnc_symbol"]
ADhgnc <- ADhgnc[ADhgnc!=""]

library(RColorBrewer)
library(gplots)
mypalette <- brewer.pal(11,"RdYlBu")
morecols <- colorRampPalette(mypalette)

col.cell <- c("cyan","red")[dat$Unit]
heatmap.2(prueba,col=rev(morecols(50)),trace="none",
          main="Genes de riesgo AD entre las muestras",
          ColSideColors=col.cell,scale="row")

ggplot(dat,aes(x=Unit, y=Expr, fill=Unit, alpha=Unit)) +
  ylab("Normalized expression") + geom_boxplot(outlier.size = NA) +
  ggtitle("Expresion en cerebro: region cortical") + xlab("") +
  scale_alpha_manual(values=c(0.2, 1)) + theme_classic() +
  theme(legend.position="na", text = element_text(size = 20))

5 single cell de la unidad Neurovascular

load("datos_newAnalysis/ADgenes.rda")
load("data/geneAnno2.rda")
targetname <- "AD"
targetgene <- ADhgnc
cellexp <- read.table("data/DER-20_Single_cell_expression_processed_TPM_backup.tsv",header=T,fill=T)
cellexp[1121,1] <- cellexp[1120,1]
cellexp <- cellexp[-1120,]
rownames(cellexp) <- cellexp[,1]
cellexp <- cellexp[,-1]
datExpr <- scale(cellexp,center=T, scale=F)
datExpr <- datExpr[,789:ncol(datExpr)]

exprdat <- apply(datExpr[match(targetgene, rownames(datExpr)),],2,mean,na.rm=T)
dat <- data.frame(Group=targetname, cell=names(exprdat), Expr=exprdat)
dat$celltype <- unlist(lapply(strsplit(dat$cell, split="[.]"),'[[',1))
dat <- dat[-grep("Ex|In",dat$celltype),]
dat$celltype <- gsub("Dev","Fetal",dat$celltype)
dat$celltype <- factor(dat$celltype, levels=c("Neurons","Astrocytes","Microglia","Endothelial",
                                              "Oligodendrocytes","OPC","Fetal"))

ggplot(dat,aes(x=celltype, y=Expr, fill=celltype)) +
  ylab("Normalized expression") + xlab("") +
  geom_violin() + theme_bw() +
  theme(axis.text.x=element_text(angle = 90, hjust=1),
        text = element_text(size=15)) +
  theme(legend.position="none") +
  ggtitle(paste0("Cellular expression profiles of AD risk genes"))

6 Enriquecimiento funcional

library(hypeR)
library(tidyverse)
library(reactable)
genesets <- msigdb_gsets("Homo sapiens", "C2", "CP:REACTOME", clean=TRUE)
print(genesets)
## C2.CP:REACTOME v7.2.1 
## 2 Ltr Circle Formation (7)
## A Tetrasaccharide Linker Sequence Is Required For Gag Synthesis (26)
## Abacavir Metabolism (5)
## Abacavir Transmembrane Transport (5)
## Abacavir Transport And Metabolism (10)
## Abc Family Proteins Mediated Transport (103)
genes <- read.table("datos_newAnalysis/ADgenes.txt")
signatures <- genes$V1
mhyp <- hypeR(signatures, genesets, test="hypergeometric", background=360000)
hyp_dots(mhyp, merge=TRUE, fdr=0.01, top=100, title="Co-expression Modules")

hyp_emap(mhyp)